import datasets
from datasets import load_dataset
from datasets import load_from_disk
from datasets import Dataset, DatasetDict
from distilabel.distiset import Distiset
import logging
import random
import time
from openai import OpenAI

def process_boolean_list(bool_list, group_size=8):
    """Majority-vote a flat sequence of booleans in consecutive fixed-size groups.

    The input is cut into consecutive chunks of ``group_size`` items (the last
    chunk may be shorter). A chunk votes ``True`` only when strictly more than
    half of its items are ``True``; an exact tie therefore votes ``False``.

    Args:
        bool_list: Iterable of ``bool`` values.
        group_size: Number of consecutive items per voting group. Defaults to
            8, the value that was previously hard-coded.

    Returns:
        A tuple ``(result_list, true_proportion)`` where ``result_list`` holds
        one majority verdict per group and ``true_proportion`` is the fraction
        of groups that voted ``True`` (``0`` when there are no groups).

    Raises:
        ValueError: If any item is not a ``bool`` or ``group_size`` is less
            than 1.
    """
    # Materialise once so generators and other one-shot iterables also work.
    values = list(bool_list)

    if not all(isinstance(x, bool) for x in values):
        raise ValueError("bool_list must contain only bool values (True/False)")
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    result_list = []
    for start in range(0, len(values), group_size):
        group = values[start:start + group_size]
        # Strict majority: 2 * trues > size is the integer form of
        # trues > size / 2, so a tie counts as False.
        result_list.append(2 * sum(group) > len(group))

    true_proportion = sum(result_list) / len(result_list) if result_list else 0

    return result_list, true_proportion


# Load the Distiset stored on disk, then select the 'train' split of its
# 'default' entry; this is the dataset graded further down in the script.
distiset1 = Distiset.load_from_disk(".../SFTData/entire_ToMATO_majority")
default_config = distiset1['default']
dataset = default_config['train']
print(dataset)




# Route root-logger records at INFO and above to a log file, each line
# prefixed with a timestamp and the record's level name.
logging.basicConfig(
    # Plain string literal: the previous f-string had no placeholders.
    filename="log_hitom_ToMATO_majority.log",
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def Evaluate(prompt: str, model_name, max_retries=30, max_backoff=60.0):
    """Send ``prompt`` to a chat model and return the text of its reply.

    The request is retried on any exception, sleeping with exponential
    backoff plus random jitter between attempts. The exponential term is
    capped at ``max_backoff`` seconds; uncapped, ``2 ** i`` with the default
    30 retries would grow to sleeps of years.

    Args:
        prompt: User message sent to the model.
        model_name: Model identifier passed to the chat-completions endpoint.
        max_retries: Maximum number of attempts; must be at least 1.
        max_backoff: Upper bound, in seconds, on the exponential part of the
            delay between attempts.

    Returns:
        The message content of the first choice in the completion.

    Raises:
        ValueError: If ``max_retries`` is less than 1 (previously the function
            silently returned ``None`` in that case).
        Exception: Whatever the final attempt raised, once retries run out.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    # Build the client once rather than on every retry.
    # NOTE(review): api_key and base_url are blank here; presumably they are
    # filled in before running -- confirm how credentials are supplied.
    client = OpenAI(
        api_key="",
        base_url="",
    )
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': prompt},
    ]

    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=messages,
                temperature=0,
            )
            output = completion.choices[0].message.content
        except Exception:
            if attempt == max_retries - 1:
                raise
            sleep_time = min(2 ** attempt, max_backoff) + random.random()
            # Record the failure instead of swallowing it silently.
            logging.warning(
                "Evaluate attempt %d/%d failed; retrying in %.1fs",
                attempt + 1,
                max_retries,
                sleep_time,
                exc_info=True,
            )
            time.sleep(sleep_time)
        else:
            print(output)
            return output

def str_to_bool(s):
    """Interpret a grader reply as a boolean verdict.

    Surrounding whitespace, quotes, backticks, asterisks and periods are
    ignored, so replies such as ``'True'`` or ``True.`` still count as True.
    A missing reply (``None``) and anything else count as False.
    """
    if s is None:
        return False
    return s.strip().strip("'\"`*.").lower() == 'true'


# Grade every example with the LLM judge and collect one boolean per example.
true_list = []
for example in dataset:
    generation = example["generation"]
    answer = example["a_str"]
    prompt = f"""\
This is someone's response [{generation}]:


This is the correct answer:

[{answer}]

Is final answer correct? Output 'True' or 'False' only.
"""

    graded_answer = Evaluate(prompt, 'deepseek-v3')
    logging.info(graded_answer)
    print(graded_answer)
    true_list.append(str_to_bool(graded_answer))

# Collapse the per-example verdicts into per-group majority votes.
# NOTE(review): presumably each run of consecutive examples forming a group
# belongs to the same question -- confirm against how the dataset was built.
result_list, true_proportion = process_boolean_list(true_list)

print(result_list)
print(true_proportion)
